1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

DARWIN <- read.csv("~/GitHub/FCA/Data/DARWIN/DARWIN.csv")
rownames(DARWIN) <- DARWIN$ID
DARWIN$ID <- NULL
DARWIN$class <- 1*(DARWIN$class=="P")
print(table(DARWIN$class))
#> 
#>  0  1 
#> 85 89

DARWIN[,1:ncol(DARWIN)] <- sapply(DARWIN,as.numeric)

signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(DARWIN) %in% c("class"));
DARWIN[,whof] <- signedlog(DARWIN[,whof])

1.2.0.1 Standarize the names for the reporting

studyName <- "DARWIN"
dataframe <- DARWIN
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
174 450
pander::pander(table(dataframe[,outcome]))
0 1
85 89

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1000

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data


if (!largeSet)
{
  
  hm <- heatMaps(data=dataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9992136

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 450 , Uni p: 0.006350853 , Uncorrelated Base: 241 , Outcome-Driven Size: 0 , Base Size: 241 
#> 
#> 
 1 <R=0.999,w=  1,N=   82>, Top: 41( 1 )[ 1 : 41 Fa= 41 : 0.975 ]( 41 , 41 , 0 ),<|>Tot Used: 82 , Added: 41 , Zero Std: 0 , Max Cor: 0.974
#> 
 2 <R=0.974,w=  1,N=   82>, Top: 18( 1 )[ 1 : 18 Fa= 58 : 0.962 ]( 18 , 19 , 41 ),<|>Tot Used: 118 , Added: 19 , Zero Std: 0 , Max Cor: 0.960
#> 
 3 <R=0.960,w=  1,N=   82>, Top: 8( 1 )[ 1 : 8 Fa= 66 : 0.955 ]( 8 , 8 , 58 ),<|>Tot Used: 134 , Added: 8 , Zero Std: 0 , Max Cor: 0.955
#> 
 4 <R=0.955,w=  2,N=   43>, Top: 21( 1 )[ 1 : 21 Fa= 84 : 0.927 ]( 21 , 21 , 66 ),<|>Tot Used: 173 , Added: 21 , Zero Std: 0 , Max Cor: 0.927
#> 
 5 <R=0.927,w=  2,N=   43>, Top: 8( 1 )[ 1 : 8 Fa= 89 : 0.914 ]( 8 , 9 , 84 ),<|>Tot Used: 187 , Added: 9 , Zero Std: 0 , Max Cor: 0.912
#> 
 6 <R=0.912,w=  2,N=   43>, Top: 3( 1 )[ 1 : 3 Fa= 90 : 0.906 ]( 3 , 3 , 89 ),<|>Tot Used: 191 , Added: 3 , Zero Std: 0 , Max Cor: 0.906
#> 
 7 <R=0.906,w=  3,N=  110>, Top: 50( 2 )[ 1 : 50 Fa= 129 : 0.853 ]( 49 , 54 , 90 ),<|>Tot Used: 278 , Added: 54 , Zero Std: 0 , Max Cor: 0.883
#> 
 8 <R=0.883,w=  3,N=  110>, Top: 11( 1 )[ 1 : 11 Fa= 135 : 0.841 ]( 11 , 11 , 129 ),<|>Tot Used: 290 , Added: 11 , Zero Std: 0 , Max Cor: 0.841
#> 
 9 <R=0.841,w=  4,N=   52>, Top: 25( 1 )[ 1 : 25 Fa= 142 : 0.800 ]( 24 , 26 , 135 ),<|>Tot Used: 306 , Added: 26 , Zero Std: 0 , Max Cor: 0.929
#> 
 10 <R=0.929,w=  4,N=   52>, Top: 3( 1 )[ 1 : 3 Fa= 144 : 0.814 ]( 3 , 3 , 142 ),<|>Tot Used: 306 , Added: 3 , Zero Std: 0 , Max Cor: 0.810
#> 
 11 <R=0.810,w=  5,N=    4>, Top: 2( 1 )[ 1 : 2 Fa= 144 : 0.800 ]( 2 , 2 , 144 ),<|>Tot Used: 306 , Added: 2 , Zero Std: 0 , Max Cor: 0.798
#> 
 12 <R=0.798,w=  6,N=    0>
#> 
 [ 12 ], 0.7945852 Decor Dimension: 306 . Cor to Base: 161 , ABase: 118 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

489

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

330

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.9

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

4.61

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.798365

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.8.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : mean_jerk_in_air6 200 : disp_index12 300 : La_mean_speed_in_air17 400 : La_gmrt_on_paper23

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
total_time23 0.767 0.909 -0.366 0.736 6.93e-05 0.863
total_time15 0.775 1.062 -0.442 0.572 4.78e-01 0.844
air_time23 0.599 0.766 -0.374 0.715 2.31e-02 0.844
air_time15 0.684 1.112 -0.506 0.669 7.09e-01 0.829
total_time17 0.806 1.082 -0.400 0.966 3.10e-02 0.824
paper_time23 0.690 1.106 -0.435 0.703 6.55e-01 0.814
air_time17 0.674 0.980 -0.378 0.863 8.86e-02 0.806
paper_time17 0.664 1.045 -0.413 0.929 1.79e-01 0.796
total_time6 0.680 1.069 -0.364 0.665 7.13e-01 0.790
air_time16 0.426 0.841 -0.414 0.650 8.51e-01 0.787


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
total_time23 0.7669 0.9089 -0.3659 0.7356 6.93e-05 0.863
total_time15 0.7754 1.0623 -0.4417 0.5721 4.78e-01 0.844
total_time17 0.8059 1.0820 -0.4001 0.9658 3.10e-02 0.824
paper_time17 0.6641 1.0452 -0.4133 0.9288 1.79e-01 0.796
total_time6 0.6804 1.0693 -0.3643 0.6653 7.13e-01 0.790
disp_index23 0.5808 0.9243 -0.3531 0.8162 3.70e-01 0.787
total_time7 0.5542 0.8587 -0.2404 0.8645 7.47e-03 0.785
total_time16 0.5332 0.8335 -0.3041 0.6411 1.78e-01 0.784
total_time22 0.6670 1.0498 -0.3412 0.6015 4.94e-01 0.780
gmrt_in_air7 -0.4478 0.8115 0.4227 0.7944 9.97e-01 0.775
La_total_time2 0.3630 0.6054 -0.0507 0.4814 2.31e-01 0.707
La_disp_index21 -0.4089 0.6245 -0.0181 0.3042 8.89e-01 0.706
La_air_time21 0.0669 0.3949 -0.1956 0.3518 2.26e-01 0.706
La_paper_time6 0.2865 0.6900 -0.1515 0.5442 4.14e-01 0.698
La_mean_jerk_in_air7 -0.0162 0.0512 -0.0572 0.0722 2.62e-01 0.690

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.07 182 0.404


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
total_time23 0.7669 0.9089 -0.3659 0.7356 6.93e-05 0.863 0.863 2
total_time15 0.7754 1.0623 -0.4417 0.5721 4.78e-01 0.844 0.844 1
total_time17 0.8059 1.0820 -0.4001 0.9658 3.10e-02 0.824 0.824 1
paper_time17 0.6641 1.0452 -0.4133 0.9288 1.79e-01 0.796 0.796 1
total_time6 0.6804 1.0693 -0.3643 0.6653 7.13e-01 0.790 0.790 1
total_time2 NA 0.5383 0.9470 -0.4676 0.7818 7.26e-01 0.787 0.787 NA
disp_index23 0.5808 0.9243 -0.3531 0.8162 3.70e-01 0.787 0.787 NA
total_time7 0.5542 0.8587 -0.2404 0.8645 7.47e-03 0.785 0.785 1
total_time16 0.5332 0.8335 -0.3041 0.6411 1.78e-01 0.784 0.784 2
total_time22 0.6670 1.0498 -0.3412 0.6015 4.94e-01 0.780 0.780 2
gmrt_in_air7 -0.4478 0.8115 0.4227 0.7944 9.97e-01 0.775 0.775 1
paper_time6 NA 0.6807 1.2538 -0.2913 0.7108 8.96e-01 0.729 0.729 NA
disp_index2 NA 0.2160 0.8693 -0.5135 1.0058 3.28e-01 0.719 0.719 2
La_total_time2 -0.812disp_index2 + 1.000total_time2 0.3630 0.6054 -0.0507 0.4814 2.31e-01 0.707 0.787 0
La_disp_index21 + 1.000disp_index21 -0.906paper_time21 -0.4089 0.6245 -0.0181 0.3042 8.89e-01 0.706 0.538 -1
La_air_time21 + 1.000air_time21 -0.847num_of_pendown21 0.0669 0.3949 -0.1956 0.3518 2.26e-01 0.706 0.688 -1
La_paper_time6 + 0.840mean_speed_on_paper6 + 1.000paper_time6 0.2865 0.6900 -0.1515 0.5442 4.14e-01 0.698 0.729 0
mean_jerk_in_air7 NA -0.3593 0.8283 0.1846 1.0872 7.51e-02 0.695 0.695 NA
mean_acc_in_air7 NA -0.3322 0.7984 0.2342 1.0407 1.95e-01 0.695 0.695 1
La_mean_jerk_in_air7 -1.033mean_acc_in_air7 + 1.000mean_jerk_in_air7 -0.0162 0.0512 -0.0572 0.0722 2.62e-01 0.690 0.695 -1
air_time21 NA 0.2100 0.8146 -0.3194 0.7050 7.65e-01 0.688 0.688 NA
mean_speed_on_paper6 NA -0.4693 1.2166 0.1664 0.8458 8.52e-01 0.653 0.653 3
num_of_pendown21 NA 0.1689 0.8947 -0.1461 0.7239 3.54e-01 0.609 0.609 2
paper_time21 NA 0.1398 1.1738 -0.0900 1.0840 8.16e-01 0.542 0.542 NA
disp_index21 NA -0.2822 1.3004 -0.0996 0.9807 7.01e-02 0.538 0.538 NA